{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. ID - 唯一ID（不能用于预测）\n",
    "2. Gender - 性别\n",
    "3. City - 城市\n",
    "4. Monthly_Income - 月收入（以卢比为单位）\n",
    "5. DOB - 出生日期\n",
    "6. Lead_Creation_Date - 潜在（贷款）创建日期\n",
    "7. Loan_Amount_Applied - 贷款申请请求金额（印度卢比，INR）\n",
    "8. Loan_Tenure_Applied - 贷款申请期限（单位为年）\n",
    "9. Existing_EMI -现有贷款的EMI（EMI：电子货币机构许可证） \n",
    "10. Employer_Name雇主名称\n",
    "11. Salary_Account - 薪资帐户银行\n",
    "12. Mobile_Verified - 是否移动验证（Y / N）\n",
    "13. VAR5 - 连续型变量\n",
    "14. VAR1-  类别型变量\n",
    "15. Loan_Amount_Submitted - 提交的贷款金额（在看到资格后修改和选择）\n",
    "16. Loan_Tenure_Submitted - 提交的贷款期限（单位为年，在看到资格后修改和选择）\n",
    "17. Interest_Rate - 提交贷款金额的利率\n",
    "18. Processing_Fee - 提交贷款的处理费（INR）\n",
    "19. EMI_Loan_Submitted -提交的EMI贷款金额（INR）\n",
    "20. Filled_Form - 后期报价后是否已填写申请表格\n",
    "21. Device_Type - 进行申请的设备（浏览器/移动设备）\n",
    "22. Var2 - 类别型变量\n",
    "23. Source - 类别型变量\n",
    "24. Var4 - 类别型变量\n",
    "\n",
    "输出：\n",
    "25. LoggedIn - 是否login（只用于理解问题的变量，不能用于预测，测试集中没有）\n",
    "26. Disbursed - 是否发放贷款（目标变量），1为发放贷款（目标客户）\n",
    "\n",
    "二、作业要求：\n",
    "1. 适当的特征工程（20分）\n",
    "2. 用LightGBM完成任务，并用交叉验证对模型的超参数（learning_rate、n_estimators、num_leaves、max_depth、min_data_in_leaf、colsample_bytree、subsample）进行调优。（70分）\n",
    "或者用XGBoost完成任务，并用交叉验证对模型的超参数（learning_rate、n_estimators、max_depth、min_child_weight、colsample_bytree、subsample、reg_lambda、reg_）进行调优。\n",
    "3. 对最终模型给出特征重要性（10分）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#首先 import 必要的模块\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "#color = sns.color_palette()\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/sw/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (12,18) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>City</th>\n",
       "      <th>Monthly_Income</th>\n",
       "      <th>DOB</th>\n",
       "      <th>Lead_Creation_Date</th>\n",
       "      <th>Loan_Amount_Applied</th>\n",
       "      <th>Loan_Tenure_Applied</th>\n",
       "      <th>Existing_EMI</th>\n",
       "      <th>Employer_Name</th>\n",
       "      <th>...</th>\n",
       "      <th>Interest_Rate</th>\n",
       "      <th>Processing_Fee</th>\n",
       "      <th>EMI_Loan_Submitted</th>\n",
       "      <th>Filled_Form</th>\n",
       "      <th>Device_Type</th>\n",
       "      <th>Var2</th>\n",
       "      <th>Source</th>\n",
       "      <th>Var4</th>\n",
       "      <th>LoggedIn</th>\n",
       "      <th>Disbursed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ID000002C20</td>\n",
       "      <td>Female</td>\n",
       "      <td>Delhi</td>\n",
       "      <td>20000</td>\n",
       "      <td>23-May-78</td>\n",
       "      <td>15-May-15</td>\n",
       "      <td>300000.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>CYBOSOL</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N</td>\n",
       "      <td>Web-browser</td>\n",
       "      <td>G</td>\n",
       "      <td>S122</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ID000004E40</td>\n",
       "      <td>Male</td>\n",
       "      <td>Mumbai</td>\n",
       "      <td>35000</td>\n",
       "      <td>07-Oct-85</td>\n",
       "      <td>04-May-15</td>\n",
       "      <td>200000.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>TATA CONSULTANCY SERVICES LTD (TCS)</td>\n",
       "      <td>...</td>\n",
       "      <td>13.25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6762.9</td>\n",
       "      <td>N</td>\n",
       "      <td>Web-browser</td>\n",
       "      <td>G</td>\n",
       "      <td>S122</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ID000007H20</td>\n",
       "      <td>Male</td>\n",
       "      <td>Panchkula</td>\n",
       "      <td>22500</td>\n",
       "      <td>10-Oct-81</td>\n",
       "      <td>19-May-15</td>\n",
       "      <td>600000.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>ALCHEMIST HOSPITALS LTD</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N</td>\n",
       "      <td>Web-browser</td>\n",
       "      <td>B</td>\n",
       "      <td>S143</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ID000008I30</td>\n",
       "      <td>Male</td>\n",
       "      <td>Saharsa</td>\n",
       "      <td>35000</td>\n",
       "      <td>30-Nov-87</td>\n",
       "      <td>09-May-15</td>\n",
       "      <td>1000000.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>BIHAR GOVERNMENT</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N</td>\n",
       "      <td>Web-browser</td>\n",
       "      <td>B</td>\n",
       "      <td>S143</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ID000009J40</td>\n",
       "      <td>Male</td>\n",
       "      <td>Bengaluru</td>\n",
       "      <td>100000</td>\n",
       "      <td>17-Feb-84</td>\n",
       "      <td>20-May-15</td>\n",
       "      <td>500000.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>25000.0</td>\n",
       "      <td>GLOBAL EDGE SOFTWARE</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>N</td>\n",
       "      <td>Web-browser</td>\n",
       "      <td>B</td>\n",
       "      <td>S134</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            ID  Gender       City  Monthly_Income        DOB  \\\n",
       "0  ID000002C20  Female      Delhi           20000  23-May-78   \n",
       "1  ID000004E40    Male     Mumbai           35000  07-Oct-85   \n",
       "2  ID000007H20    Male  Panchkula           22500  10-Oct-81   \n",
       "3  ID000008I30    Male    Saharsa           35000  30-Nov-87   \n",
       "4  ID000009J40    Male  Bengaluru          100000  17-Feb-84   \n",
       "\n",
       "  Lead_Creation_Date  Loan_Amount_Applied  Loan_Tenure_Applied  Existing_EMI  \\\n",
       "0          15-May-15             300000.0                  5.0           0.0   \n",
       "1          04-May-15             200000.0                  2.0           0.0   \n",
       "2          19-May-15             600000.0                  4.0           0.0   \n",
       "3          09-May-15            1000000.0                  5.0           0.0   \n",
       "4          20-May-15             500000.0                  2.0       25000.0   \n",
       "\n",
       "                         Employer_Name    ...    Interest_Rate Processing_Fee  \\\n",
       "0                              CYBOSOL    ...              NaN            NaN   \n",
       "1  TATA CONSULTANCY SERVICES LTD (TCS)    ...            13.25            NaN   \n",
       "2              ALCHEMIST HOSPITALS LTD    ...              NaN            NaN   \n",
       "3                     BIHAR GOVERNMENT    ...              NaN            NaN   \n",
       "4                 GLOBAL EDGE SOFTWARE    ...              NaN            NaN   \n",
       "\n",
       "  EMI_Loan_Submitted Filled_Form  Device_Type  Var2  Source  Var4 LoggedIn  \\\n",
       "0                NaN           N  Web-browser     G    S122     1        0   \n",
       "1             6762.9           N  Web-browser     G    S122     3        0   \n",
       "2                NaN           N  Web-browser     B    S143     1        0   \n",
       "3                NaN           N  Web-browser     B    S143     3        0   \n",
       "4                NaN           N  Web-browser     B    S134     3        1   \n",
       "\n",
       "  Disbursed  \n",
       "0       0.0  \n",
       "1       0.0  \n",
       "2       0.0  \n",
       "3       0.0  \n",
       "4       0.0  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"Train-UTF8.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 87020 entries, 0 to 87019\n",
      "Data columns (total 26 columns):\n",
      "ID                       87020 non-null object\n",
      "Gender                   87020 non-null object\n",
      "City                     86017 non-null object\n",
      "Monthly_Income           87020 non-null int64\n",
      "DOB                      87020 non-null object\n",
      "Lead_Creation_Date       87020 non-null object\n",
      "Loan_Amount_Applied      86949 non-null float64\n",
      "Loan_Tenure_Applied      86949 non-null float64\n",
      "Existing_EMI             86949 non-null float64\n",
      "Employer_Name            86949 non-null object\n",
      "Salary_Account           75256 non-null object\n",
      "Mobile_Verified          87020 non-null object\n",
      "Var5                     87020 non-null object\n",
      "Var1                     87019 non-null object\n",
      "Loan_Amount_Submitted    52407 non-null float64\n",
      "Loan_Tenure_Submitted    52407 non-null float64\n",
      "Interest_Rate            27726 non-null float64\n",
      "Processing_Fee           27420 non-null float64\n",
      "EMI_Loan_Submitted       27727 non-null object\n",
      "Filled_Form              87020 non-null object\n",
      "Device_Type              87020 non-null object\n",
      "Var2                     87020 non-null object\n",
      "Source                   87020 non-null object\n",
      "Var4                     87020 non-null int64\n",
      "LoggedIn                 87020 non-null int64\n",
      "Disbursed                87019 non-null float64\n",
      "dtypes: float64(8), int64(3), object(15)\n",
      "memory usage: 17.3+ MB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 删除无用列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop([\"ID\", \"LoggedIn\"], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEKCAYAAADaa8itAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFpFJREFUeJzt3Xu0JWV95vHvw02Qi4C2LtNAGmNH\nJTreWoImEy8oAk5oZEGCyxEGmcFkAHESHTHJsuNdEoRI4iWsQLiMw0UGtEWUQQSdOIo0F8UGWXQw\nQg8EWkFEGGEafvPHfg/s7j6Xorv2OWzP97PWXqfqrbdq/7ar5VlVb9VbqSokSerDZnNdgCTpV4eh\nIknqjaEiSeqNoSJJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEiSerNFnNdwGx7xjOeUYsWLZrrMiRp\nbFxzzTU/qaoFXfrOu1BZtGgRK1asmOsyJGlsJPlx175e/pIk9cZQkST1xlCRJPXGUJEk9cZQkST1\nxlCRJPXGUJEk9cZQkST1xlCRJPVm3j1Rv6le/p6z5roEPQld89eHzXUJ0pOCZyqSpN4YKpKk3hgq\nkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTejDRUkvyXJCuT/CDJOUm2TrJ7kquS3JLk\nvCRbtb5Paeur2vZFQ8d5X2u/Ockbh9r3bW2rkhw/yt8iSZrZyEIlyULgncCSqnohsDlwKHACcHJV\nLQbuBY5suxwJ3FtVzwVObv1Iskfb77eAfYFPJ9k8yebAp4D9gD2At7S+kqQ5MurLX1sA2yTZAngq\ncCfwOuCCtv1M4MC2vLSt07bvnSSt/dyqeqiqfgSsAvZsn1VVdWtVPQyc2/pKkubIyEKlqv4PcCJw\nG4MwuQ+4BvhZVa1t3VYDC9vyQuD2tu/a1v/pw+3r7TNVuyRpjozy8tdODM4cdgd+DdiWwaWq9dXE\nLlNse6Ltk9VyVJIVSVasWbNmptIlSRtplJe/Xg/8qKrWVNX/Ay4EXgXs2C6HAewC3NGWVwO7ArTt\nTwPuGW5fb5+p2jdQVadW1ZKqWrJgwYI+fpskaRKjDJXbgL2SPLWNjewN3AhcARzc+hwOfLEtL2/r\ntO1fr6pq7Ye2u8N2BxYD3wWuBha3u8m2YjCYv3yEv0eSNIORvaSrqq5KcgFwLbAWuA44FfgycG6S\nD7e209oupwFnJ1nF4Azl0HaclUnOZxBIa4Gjq+oRgCTHAJcyuLPs9KpaOarfI0ma2Ujf/FhVy4Bl\n6zXfyuDOrfX7/hI4ZIrjfAT4yCTtlwCXbHqlkqQ++ES9JKk3hookqTeGiiSpN4aKJKk3hookqTeG\niiSpN4aKJKk3hookqTeGiiSpN4aKJKk3hookqTeGiiSpN4aKJKk3hookqTeGiiSpN4aKJKk3hook\nqTeGiiSpN4aKJKk3hookqTeGiiSpN4aKJKk3M4ZKkm2TbNaWfzPJAUm2HH1pkqRx0+VM5ZvA1kkW\nApcDRwBnjLIoSdJ46hIqqaoHgYOAv62qNwN7jLYsSdI46hQqSV4JvBX4cmvbYnQlSZLGVZdQeRfw\nPuCiqlqZ5DnAFaMtS5I0jmY846iqbwDfSLJtW78VeOeoC5MkjZ8ud3+9MsmNwE1t/cVJPj3yyiRJ\nY6fL5a+/Ad4I/BSgqr4H/N4oi5IkjadODz9W1e3rNT0yglokSWOuy11ctyd5FVBJtmIwnnLTaMuS\nJI2jLmcqfwQcDSwEVgMvaeuSJK2jy91fP2HwjIokSdPqcvfXmUl2HFrfKcnpoy1LkjSOulz++jdV\n9bOJlaq6F3jp6EqSJI2rLqGyWZKdJlaS7IzTtEiSJtElHD4B/O8kF7T1Q4CPjK4kSdK46jJQf1aS\na4DXAgEOqqobR16ZJGnsdH3z4w+BC4EvAr9IsluXnZLsmOSCJD9MclOb8mXnJJcluaX93an1TZJT\nkqxK8v0kLxs6zuGt/y1JDh9qf3mSG9o+pyRJ958uSepbl7u/jgXuAi4DLmYw/f3FHY//SeCrVfV8\n4MUMHpo8Hri8qhYzeOnX8a3vfsDi9jkK+Ez7/p2BZcBvA3sCy4bGeD7T+k7st2/HuiRJI9BlTOU4\n4HlV9dMncuAkOzCYI+w/AFTVw8DDSZYCr2ndzgSuBN4LLAXOqqoCvtPOcp7d+l5WVfe0414G7Jvk\nSmCHqvp2az8LOBD4yhOpU5LUny6Xv24H7tuIYz8HWAP8Y5LrkvxDmz7/WVV1J0D7+8zWf2H7rgmr\nW9t07asnaZckzZEuZyq3Alcm+TLw0ERjVZ3U4dgvA46tqquSfJLHL3VNZrLxkNqI9g0PnBzF4DIZ\nu+3WaThIkrQRupyp3MZgPGUrYPuhz0xWA6ur6qq2fgGDkLmrXdai/b17qP+uQ/vvAtwxQ/suk7Rv\noKpOraolVbVkwYIFHUqXJG2MLrcUfwAgybZV9UDXA1fVvya5PcnzqupmYG/gxvY5HPh4+/vFtsty\n4Jgk5zIYlL+vqu5Mcinw0aHB+X2A91XVPUnuT7IXcBVwGPC3XeuTJPVvxlBJ8krgNGA7YLckLwbe\nUVX/ucPxjwU+16bMvxU4gsHZ0flJjmRwFnRI63sJsD+wCniw9aWFx4eAq1u/D04M2gN/DJwBbMNg\ngN5BekmaQ13GVCbe/LgcBm9+TNLpzY9VdT2wZJJNe0/St5hiSv2qOh3YYBLLqloBvLBLLZKk0fPN\nj5Kk3vjmR0lSb3zzoySpN9OeqSTZHHhbVfnmR0nSjKY9U6mqRxhMnyJJ0oy6jKl8K8nfAecBjz2n\nUlXXjqwqSdJY6hIqr2p/PzjUVsDr+i9HkjTOZhpT2Qz4TFWdP0v1SJLG2ExjKo8Cx8xSLZKkMdfl\nluLLkrw7ya7trY07txdnSZK0ji5jKm9vf4efTSkG70uRJOkxXWYp3n02CpEkjb8usxQfNll7VZ3V\nfzmSpHHW5fLXK4aWt2Yww/C1gKEiSVpHl8tfxw6vJ3kacPbIKpIkja1OU9+v50Fgcd+FSJLGX5cx\nlS8xuNsLBiG0B+DDkJKkDXQZUzlxaHkt8OOqWj2ieiRJY6xLqNwG3FlVvwRIsk2SRVX1LyOtTJI0\ndrqMqXweeHRo/ZHWJknSOrqEyhZV9fDESlveanQlSZLGVZdQWZPkgImVJEuBn4yuJEnSuOoypvJH\nwOfai7pg8J76SZ+ylyTNb10efvxnYK8k2wGpqvtHX5YkaRzNePkryUeT7FhVv6iq+5PslOTDs1Gc\nJGm8dBlT2a+qfjaxUlX3AvuPriRJ0rjqEiqbJ3nKxEqSbYCnTNNfkjRPdRmo/2/A5Un+kcF0LW8H\nzhxpVZKksdRloP6vknwfeH1r+lBVXTrasiRJ46jLmQrAdcCWDM5UrhtdOZKkcdbl7q8/AL4LHAz8\nAXBVkoNHXZgkafx0OVP5c+AVVXU3QJIFwNeAC0ZZmCRp/HS5+2uziUBpftpxP0nSPNPlTOWrSS4F\nzmnrfwhcMrqSJEnjqsvdX+9JchDwu0CAU6vqopFXJkkaO53u/qqqC4ELR1yLJGnMOTYiSeqNoSJJ\n6s2UoZLk8vb3hNkrR5I0zqYbU3l2klcDByQ5l8Eg/WOq6tqRViZJGjvTXf56P3A8sAtwEvCJoc+J\nXb8gyeZJrktycVvfPclVSW5Jcl6SrVr7U9r6qrZ90dAx3tfab07yxqH2fVvbqiTHd//ZkqRRmDJU\nquqCqtoP+Kuqeu16n9c9ge84DrhpaP0E4OSqWgzcCxzZ2o8E7q2q5wInt34k2QM4FPgtYF/g0y2o\nNgc+BewH7AG8pfWVJM2RGQfqq+pDSQ5IcmL7/LuuB0+yC/Am4B/aeoDX8fgUL2cCB7blpTw+pf4F\nwN6t/1Lg3Kp6qKp+BKwC9myfVVV1a1U9DJzb+kqS5kiXCSU/xuBs48b2Oa61dfE3wH8FHm3rTwd+\nVlVr2/pqYGFbXgjcDtC239f6P9a+3j5TtU/2G45KsiLJijVr1nQsXZL0RHW5pfhNwBuq6vSqOp3B\nJag3zbRTO6O5u6quGW6epGvNsO2Jtm/YWHVqVS2pqiULFiyYpmpJ0qbo+j6VHYF72vLTOu7zOwzu\nHNsf2BrYgcGZy45JtmhnI7sAd7T+q4FdgdVJtmjfc89Q+4ThfaZqlyTNgS5nKh8DrktyRpIzgWuA\nj860U1W9r6p2qapFDAbav15VbwWuYPBuFoDDgS+25eVtnbb961VVrf3QdnfY7sBiBu93uRpY3O4m\n26p9x/IOv0eSNCJdJpQ8J8mVwCsYXHJ6b1X96yZ853uBc5N8mMFbJE9r7acBZydZxeAM5dD2/SuT\nnM9gPGctcHRVPQKQ5BjgUmBz4PSqWrkJdUmSNlHXCSXvZBPOAqrqSuDKtnwrgzu31u/zS+CQKfb/\nCPCRSdovwWn4JelJw7m/JEm9MVQkSb2ZNlSSbJbkB7NVjCRpvE0bKlX1KPC9JLvNUj2SpDHWZaD+\n2cDKJN8FHphorKoDRlaVJGksdQmVD4y8CknSr4Quz6l8I8mvA4ur6mtJnsrguRBJktbRZULJ/8Rg\n1uC/b00LgS+MsihJ0njqckvx0Qzm8fo5QFXdAjxzlEVJksZTl1B5qL2vBIA22eOkswFLkua3LqHy\njSR/BmyT5A3A54EvjbYsSdI46hIqxwNrgBuAdzCYa+svRlmUJGk8dbn769E25f1VDC573dympJck\naR0zhkqSNwGfBf6ZwdT3uyd5R1V9ZdTFSZLGS5eHHz8BvLaqVgEk+Q3gy4ChIklaR5cxlbsnAqW5\nFbh7RPVIksbYlGcqSQ5qiyuTXAKcz2BM5RAGr/KVJGkd013++v2h5buAV7flNcBOI6tIkjS2pgyV\nqjpiNguRJI2/Lnd/7Q4cCywa7u/U95Kk9XW5++sLwGkMnqJ/dLTlSJLGWZdQ+WVVnTLySiRJY69L\nqHwyyTLgfwIPTTRW1bUjq0qSNJa6hMqLgLcBr+Pxy1/V1iVJekyXUHkz8Jzh6e8lSZpMlyfqvwfs\nOOpCJEnjr8uZyrOAHya5mnXHVLylWJK0ji6hsmzkVUiSfiV0eZ/KN2ajEEnS+OvyRP39PP5O+q2A\nLYEHqmqHURYmSRo/Xc5Uth9eT3IgsOfIKpIkja0ud3+to6q+gM+oSJIm0eXy10FDq5sBS3j8cpgk\nSY/pcvfX8HtV1gL/AiwdSTWSpLHWZUzF96pIkjqZ7nXC759mv6qqD42gHknSGJvuTOWBSdq2BY4E\nng4YKpKkdUz3OuFPTCwn2R44DjgCOBf4xFT7SZLmr2nHVJLsDPwJ8FbgTOBlVXXvbBQmSRo/Uz6n\nkuSvgauB+4EXVdVfPpFASbJrkiuS3JRkZZLjWvvOSS5Lckv7u1NrT5JTkqxK8v0kLxs61uGt/y1J\nDh9qf3mSG9o+pyTJRvxvIEnqyXQPP/4p8GvAXwB3JPl5+9yf5Ocdjr0W+NOqegGwF3B0kj2A44HL\nq2oxcHlbB9gPWNw+RwGfgcfOlpYBv83gSf5lE0HU+hw1tN++3X62JGkUpgyVqtqsqrapqu2raoeh\nz/Zd5v2qqjsnXjlcVfcDNwELGTzjcmbrdiZwYFteCpxVA98BdkzybOCNwGVVdU87U7oM2Ldt26Gq\nvl1VBZw1dCxJ0hx4wtO0bIwki4CXAlcBz6qqO2EQPMAzW7eFwO1Du61ubdO1r56kfbLvPyrJiiQr\n1qxZs6k/R5I0hZGHSpLtgP8BvKuqprtsNtl4SG1E+4aNVadW1ZKqWrJgwYKZSpYkbaSRhkqSLRkE\nyueq6sLWfFe7dEX7e3drXw3sOrT7LsAdM7TvMkm7JGmOjCxU2p1YpwE3VdVJQ5uWAxN3cB0OfHGo\n/bB2F9hewH3t8tilwD5JdmoD9PsAl7Zt9yfZq33XYUPHkiTNgS4TSm6s3wHeBtyQ5PrW9mfAx4Hz\nkxwJ3AYc0rZdAuwPrAIeZPCgJVV1T5IPMbi9GeCDVXVPW/5j4AxgG+Ar7SNJmiMjC5Wq+icmH/cA\n2HuS/gUcPcWxTgdOn6R9BfDCTShTktSjWbn7S5I0PxgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4Y\nKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqS\npN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTe\nGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN4YKpKk3hgqkqTeGCqSpN6Mfagk2TfJ\nzUlWJTl+ruuRpPlsi7kuYFMk2Rz4FPAGYDVwdZLlVXXj3FYmzY3bPviiuS5BT0K7vf+GWfuucT9T\n2RNYVVW3VtXDwLnA0jmuSZLmrXEPlYXA7UPrq1ubJGkOjPXlLyCTtNUGnZKjgKPa6i+S3DzSquaP\nZwA/mesingxy4uFzXYI25L/PCcsm+0/lE/LrXTuOe6isBnYdWt8FuGP9TlV1KnDqbBU1XyRZUVVL\n5roOaTL++5wb437562pgcZLdk2wFHAosn+OaJGneGuszlapam+QY4FJgc+D0qlo5x2VJ0rw11qEC\nUFWXAJfMdR3zlJcU9WTmv885kKoNxrUlSdoo4z6mIkl6EjFUNKOZpsJJ8pQk57XtVyVZNPtVaj5K\ncnqSu5P8YIrtSXJK+7f5/SQvm+0a5xtDRdMamgpnP2AP4C1J9liv25HAvVX1XOBk4ITZrVLz2BnA\nvtNs3w9Y3D5HAZ+ZhZrmNUNFM+kyFc5S4My2fAGwd5JNftpKmklVfRO4Z5ouS4GzauA7wI5Jnj07\n1c1Phopm0mUqnMf6VNVa4D7g6bNSnTQ9p3KaZYaKZtJlKpxO0+VIc8B/m7PMUNFMukyF81ifJFsA\nT2P6SxLSbOk0lZP6Y6hoJl2mwlkOTMyoeDDw9fIBKD05LAcOa3eB7QXcV1V3znVRv8rG/ol6jdZU\nU+Ek+SCwoqqWA6cBZydZxeAM5dC5q1jzSZJzgNcAz0iyGlgGbAlQVZ9lMNvG/sAq4EHgiLmpdP7w\niXpJUm+8/CVJ6o2hIknqjaEiSeqNoSJJ6o2hIknqjaEidZTkkSTXJ1mZ5HtJ/iTJZm3bkiSnTLPv\na5JcPHvVbvD9f5nk3XP1/Zo/fE5F6u7/VtVLAJI8E/jvDGYPWFZVK4AVo/riJFu0edWkJzXPVKSN\nUFV3M5hK/Zj2tPZjZyJJXt3OaK5Pcl2S7dtuOyS5KMmNST47dJbzi4njJjk4yRlt+YwkJyW5Ajhh\nquMmeU+Sq9v7Qj4wdKw/b+/B+RrwvNn430XyTEXaSFV1awuGZ6636d3A0VX1rSTbAb9s7XsyeCfN\nj4GvAgcxeFXAdH4TeH1VPZLkS+sfN8k+DN4VsieDyROXJ/k94AEGMxu8lMH/z68Frtm0XyzNzDMV\nadNMNgvut4CTkrwT2HHostV323tpHgHOAX63w/E/3/pPddx92uc6BsHxfAYh82+Bi6rqwar6ORvO\n1yaNhKEibaQkzwEeAe4ebq+qjwP/EdgG+E6S509sWu8QNUn71uv1eWCG4wb4WFW9pH2eW1WnTfF9\n0sgZKtJGSLIA+Czwd+vPyJzkN6rqhqo6gcHg/USo7Nlme94M+EPgn1r7XUle0NrfPM13TnbcS4G3\nt8thJFnYbiL4JvDmJNu0sZff7+u3S9NxTEXqbpsk1zOYBXctcDZw0iT93pXktQzOYm4EvgK8Evg2\n8HHgRQz+o39R6388cDGDNxT+ANhuiu/f4LhV9VCSFwDfbm9w/gXw76vq2iTnAdczGMP5X5vyw6Wu\nnKVYktQbL39JknpjqEiSemOoSJJ6Y6hIknpjqEiSemOoSJJ6Y6hIknpjqEiSevP/AQfaa5oYJqxD\nAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f8c6d2199e8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Target 分布，两类样本分布严重不均衡，只有1.4%的样本Disbursed为1\n",
    "sns.countplot(train['Disbursed']);\n",
    "plt.xlabel('Disbursed');\n",
    "plt.ylabel('Number of occurrences');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 比例分布不均的情况使用 auc评分"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查数据质量：异常点、缺省值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gender                       0\n",
       "City                      1003\n",
       "Monthly_Income               0\n",
       "DOB                          0\n",
       "Lead_Creation_Date           0\n",
       "Loan_Amount_Applied         71\n",
       "Loan_Tenure_Applied         71\n",
       "Existing_EMI                71\n",
       "Employer_Name               71\n",
       "Salary_Account           11764\n",
       "Mobile_Verified              0\n",
       "Var5                         0\n",
       "Var1                         1\n",
       "Loan_Amount_Submitted    34613\n",
       "Loan_Tenure_Submitted    34613\n",
       "Interest_Rate            59294\n",
       "Processing_Fee           59600\n",
       "EMI_Loan_Submitted       59293\n",
       "Filled_Form                  0\n",
       "Device_Type                  0\n",
       "Var2                         0\n",
       "Source                       0\n",
       "Var4                         0\n",
       "Disbursed                    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.apply(lambda x: sum(x.isnull()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 缺失值训练时会报错，缺失值直接用0填充算了"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = train.fillna(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Var5和EMI_Loan_Submitted有异常数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"Var5\"]=train[\"Var5\"].apply(lambda x: '' if str(x)=='HBXX' else x)\n",
    "train[\"EMI_Loan_Submitted\"]=train[\"EMI_Loan_Submitted\"].apply(lambda x: '' if str(x)=='N' else x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理City、Employer_Name、Salary_Account、Source\n",
    "这些特征都是取值很多,\n",
    "取前几个重要的，其余合并成一个：others\n",
    "\n",
    "LightGBM对类别特征建立直方图时，当特征取值数目超过max_bin(默认255)，会去掉样本数目少的类别：\n",
    "统计该特征下每一种离散值出现的次数，并从高到低排序，并过滤掉出现次数较少的特征值, \n",
    "然后为每一个特征值，建立一个bin容器, 对于在bin容器内出现次数较少的特征值直接过滤掉，不建立bin容器。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features = ['City','Employer_Name','Salary_Account', 'Source']\n",
    "rare_thresholds = [100, 30, 40, 40]\n",
    "j=0\n",
    "for col in cat_features:\n",
    "    #每个取值的样本数目\n",
    "    value_counts_col =  train[col].value_counts(dropna=False)\n",
    "\n",
    "    #样本数目小于阈值的取值为稀有取值\n",
    "    rare_threshold = rare_thresholds[j]\n",
    "    value_counts_rare = list(value_counts_col[value_counts_col < rare_threshold ].index)\n",
    "\n",
    "    #稀有值合并为：others\n",
    "    rare_index = train[col].isin(value_counts_rare)\n",
    "    train.loc[ train[col].isin(value_counts_rare), col] = \"Others\"\n",
    "    \n",
    "    j = j+1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理类别性特征，gbm不需要onehot编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "le = LabelEncoder()\n",
    "feats_to_encode = ['City', 'Employer_Name', 'Salary_Account','Device_Type','Filled_Form','Gender','Mobile_Verified','Source','Var1','Var2','Var4']\n",
    "for col in feats_to_encode:\n",
    "    train[col] = le.fit_transform(train[col].astype(str))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理生日 BOD\n",
    "- 转成贷款时的年龄"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#创建一个年龄的字段Age\n",
    "train['Age'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year - pd.to_datetime(train['DOB']).dt.year\n",
    "#data['Age'].head()\n",
    "#把原始的DOB字段去掉:\n",
    "train.drop(['DOB', 'Lead_Creation_Date'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理Loan_Tenure_**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#不合理的贷款年限，设为缺失值\n",
    "train['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True)\n",
    "train['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.to_csv('./FE_train-1.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (已经删掉，不处理了)潜在（贷款）创建日期\n",
    "- 这里只有5,6,7三个月的数据，测试数据也只有5,6,7三个月\n",
    "- 可以直接离散化或者使用星期几来处理\n",
    "    - 这里转换成星期吧"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>City</th>\n",
       "      <th>Monthly_Income</th>\n",
       "      <th>Loan_Amount_Applied</th>\n",
       "      <th>Loan_Tenure_Applied</th>\n",
       "      <th>Existing_EMI</th>\n",
       "      <th>Employer_Name</th>\n",
       "      <th>Salary_Account</th>\n",
       "      <th>Mobile_Verified</th>\n",
       "      <th>Var5</th>\n",
       "      <th>...</th>\n",
       "      <th>Interest_Rate</th>\n",
       "      <th>Processing_Fee</th>\n",
       "      <th>EMI_Loan_Submitted</th>\n",
       "      <th>Filled_Form</th>\n",
       "      <th>Device_Type</th>\n",
       "      <th>Var2</th>\n",
       "      <th>Source</th>\n",
       "      <th>Var4</th>\n",
       "      <th>Disbursed</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>9.903538</td>\n",
       "      <td>12.611541</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>113</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>34</td>\n",
       "      <td>10.463132</td>\n",
       "      <td>12.206078</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>129</td>\n",
       "      <td>18</td>\n",
       "      <td>2</td>\n",
       "      <td>13</td>\n",
       "      <td>...</td>\n",
       "      <td>13.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6762.9</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>40</td>\n",
       "      <td>10.021315</td>\n",
       "      <td>13.304687</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>113</td>\n",
       "      <td>38</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>40</td>\n",
       "      <td>10.463132</td>\n",
       "      <td>13.815512</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>113</td>\n",
       "      <td>38</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>11.512935</td>\n",
       "      <td>13.122365</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.126671</td>\n",
       "      <td>113</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gender  City  Monthly_Income  Loan_Amount_Applied  Loan_Tenure_Applied  \\\n",
       "0       0    12        9.903538            12.611541                  5.0   \n",
       "1       1    34       10.463132            12.206078                  2.0   \n",
       "2       1    40       10.021315            13.304687                  4.0   \n",
       "3       1    40       10.463132            13.815512                  5.0   \n",
       "4       1     5       11.512935            13.122365                  2.0   \n",
       "\n",
       "   Existing_EMI  Employer_Name  Salary_Account  Mobile_Verified Var5 ...   \\\n",
       "0      0.000000            113              16                1    0 ...    \n",
       "1      0.000000            129              18                2   13 ...    \n",
       "2      0.000000            113              38                2    0 ...    \n",
       "3      0.000000            113              38                2   10 ...    \n",
       "4     10.126671            113              16                2   17 ...    \n",
       "\n",
       "   Interest_Rate  Processing_Fee  EMI_Loan_Submitted  Filled_Form  \\\n",
       "0           0.00             0.0                   0            1   \n",
       "1          13.25             0.0              6762.9            1   \n",
       "2           0.00             0.0                   0            1   \n",
       "3           0.00             0.0                   0            1   \n",
       "4           0.00             0.0                   0            1   \n",
       "\n",
       "   Device_Type Var2  Source  Var4  Disbursed  Age  \n",
       "0            2    6       1     1        0.0   37  \n",
       "1            2    6       1     3        0.0   30  \n",
       "2            2    1       8     1        0.0   34  \n",
       "3            2    1       8     3        0.0   28  \n",
       "4            2    1       5     3        0.0   31  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据标准化\n",
    "#from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# 标准化器\n",
    "#ss = StandardScaler()\n",
    "\n",
    "\n",
    "# 对训练数据，先调用fit方法训练模型，得到模型参数；然后对训练数据和测试数据进行transform\n",
    "#X = ss_X.fit_transform(X)\n",
    "\n",
    "train['Loan_Amount_Applied'] = np.log((1+train['Loan_Amount_Applied'].astype(float)))\n",
    "train['Monthly_Income'] = np.log((1+train['Monthly_Income'].astype(float)))\n",
    "train['Loan_Amount_Submitted'] = np.log((1+train['Loan_Amount_Submitted'].astype(float)))\n",
    "train['Existing_EMI'] = np.log((1+train['Existing_EMI'].astype(float)))\n",
    "train['Processing_Fee'] = np.log((1+train['Processing_Fee'].astype(float)))\n",
    "\n",
    "\n",
    "train.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv('./FE_train-log.csv',index=False,header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
